1 Effect of UPSTM-Based Decorrelation on Feature Discovery

1.0.1 Loading the libraries

library("FRESA.CAD")
library(readxl)
library(igraph)
library(umap)
library(tsne)
library(entropy)

op <- par(no.readonly = TRUE)
pander::panderOptions('digits', 3)
pander::panderOptions('table.split.table', 400)
pander::panderOptions('keep.trailing.zeros',TRUE)

1.1 Material and Methods

1.2 The Data

trainLabeled <- read.delim("~/GitHub/FCA/Data/trainSet.txt")
validLabeled <- read.delim("~/GitHub/FCA/Data/arcene_valid.txt")
wholeArceneSet <- rbind(trainLabeled,validLabeled)


wholeArceneSet$Labels <-  1*(wholeArceneSet$Labels > 0)
wholeArceneSet[,1:ncol(trainLabeled)] <- sapply(wholeArceneSet,as.double)

1.2.0.1 Standarize the names for the reporting

studyName <- "ARCENE"
dataframe <- wholeArceneSet
outcome <- "Labels"
thro <- 0.8
cexheat = 0.10

TopVariables <- 10

1.3 Generaring the report

1.3.1 Libraries

Some libraries

library(psych)
library(whitening)
library("vioplot")
library("rpart")

1.3.2 Data specs

pander::pander(c(rows=nrow(dataframe),col=ncol(dataframe)-1))
rows col
200 10000
pander::pander(table(dataframe[,outcome]))
0 1
112 88

varlist <- colnames(dataframe)
varlist <- varlist[varlist != outcome]

largeSet <- length(varlist) > 1500 

1.3.3 Scaling the data

Scaling and removing near zero variance columns and highly co-linear(r>0.99999) columns


  ### Some global cleaning
  sdiszero <- apply(dataframe,2,sd) > 1.0e-16
  dataframe <- dataframe[,sdiszero]

  varlist <- colnames(dataframe)[colnames(dataframe) != outcome]
  tokeep <- c(as.character(correlated_Remove(dataframe,varlist,thr=0.99999)),outcome)
  dataframe <- dataframe[,tokeep]

  varlist <- colnames(dataframe)
  varlist <- varlist[varlist != outcome]
  
  iscontinous <- sapply(apply(dataframe,2,unique),length) >= 5 ## Only variables with enough samples



dataframeScaled <- FRESAScale(dataframe,method="OrderLogit")$scaledData

1.4 The heatmap of the data

numsub <- nrow(dataframe)
if (numsub > 1000) numsub <- 1000


if (!largeSet)
{

  hm <- heatMaps(data=dataframeScaled[1:numsub,],
                 Outcome=outcome,
                 Scale=TRUE,
                 hCluster = "row",
                 xlab="Feature",
                 ylab="Sample",
                 srtCol=45,
                 srtRow=45,
                 cexCol=cexheat,
                 cexRow=cexheat
                 )
  par(op)
}

1.4.0.1 Correlation Matrix of the Data

The heat map of the data


if (!largeSet)
{

  par(cex=0.6,cex.main=0.85,cex.axis=0.7)
  #cormat <- Rfast::cora(as.matrix(dataframe[,varlist]),large=TRUE)
  cormat <- cor(dataframe[,varlist],method="pearson")
  cormat[is.na(cormat)] <- 0
  gplots::heatmap.2(abs(cormat),
                    trace = "none",
  #                  scale = "row",
                    mar = c(5,5),
                    col=rev(heat.colors(5)),
                    main = "Original Correlation",
                    cexRow = cexheat,
                    cexCol = cexheat,
                     srtCol=45,
                     srtRow=45,
                    key.title=NA,
                    key.xlab="|Pearson Correlation|",
                    xlab="Feature", ylab="Feature")
  diag(cormat) <- 0
  print(max(abs(cormat)))
}

1.5 The decorrelation


DEdataframe <- IDeA(dataframe,verbose=TRUE,thr=thro)
#> 
#>  Included: 7748 , Uni p: 2.565681e-05 , Outcome-Driven Size: 0 , Base Size: 966 , Rcrit: 0.2822717 
#> 
#> 
 1 <R=1.000,thr=0.900,N= 6808>, Top: 122( 305 ).[ 1 : 122 Fa= 122 : 0.900 ]( 122 , 2729 , 0 ),<|>Tot Used: 2851 , Added: 2729 , Zero Std: 0 , Max Cor: 1.000
#> 
 2 <R=1.000,thr=0.900,N= 6808>, Top: 435( 53 )....=( 1 )[ 2 : 435 Fa= 556 : 0.919 ]( 435 , 3567 , 122 ),<|>Tot Used: 4904 , Added: 3567 , Zero Std: 0 , Max Cor: 1.000
#> 
 3 <R=1.000,thr=0.900,N= 6808>, Top: 808( 50 )........[ 1 : 808 Fa= 1357 : 0.900 ]( 805 , 3459 , 556 ),<|>Tot Used: 5945 , Added: 3459 , Zero Std: 0 , Max Cor: 1.000
#> 
 4 <R=1.000,thr=0.900,N= 6808>, Top: 815( 16 )........=[ 2 : 815 Fa= 2151 : 0.915 ]( 811 , 2354 , 1357 ),<|>Tot Used: 6390 , Added: 2354 , Zero Std: 0 , Max Cor: 1.000
#> 
 5 <R=1.000,thr=0.900,N= 6808>, Top: 512( 7 ).....[ 1 : 512 Fa= 2644 : 0.900 ]( 510 , 1233 , 2151 ),<|>Tot Used: 6641 , Added: 1233 , Zero Std: 0 , Max Cor: 0.999
#> 
 6 <R=0.999,thr=0.900,N= 6808>, Top: 254( 5 )..=( 1 )[ 2 : 254 Fa= 2889 : 0.919 ]( 251 , 483 , 2644 ),<|>Tot Used: 6731 , Added: 483 , Zero Std: 0 , Max Cor: 0.999
#> 
 7 <R=0.999,thr=0.900,N= 6808>, Top: 97( 2 )[ 1 : 97 Fa= 2967 : 0.927 ]( 91 , 138 , 2889 ),<|>Tot Used: 6757 , Added: 138 , Zero Std: 0 , Max Cor: 0.998
#> 
 8 <R=0.998,thr=0.900,N= 6808>, Top: 28( 1 )=[ 2 : 28 Fa= 2984 : 0.913 ]( 25 , 37 , 2967 ),<|>Tot Used: 6757 , Added: 37 , Zero Std: 0 , Max Cor: 0.989
#> 
 9 <R=0.989,thr=0.900,N= 6808>, Top: 8( 2 )[ 1 : 8 Fa= 2990 : 0.900 ]( 7 , 8 , 2984 ),<|>Tot Used: 6760 , Added: 8 , Zero Std: 0 , Max Cor: 0.994
#> 
 10 <R=0.994,thr=0.900,N= 6808>, Top: 2( 2 )[ 1 : 2 Fa= 2990 : 0.900 ]( 1 , 2 , 2990 ),<|>Tot Used: 6760 , Added: 2 , Zero Std: 0 , Max Cor: 0.952
#> 
 11 <R=0.952,thr=0.900,N= 6808>, Top: 1( 1 )[ 1 : 1 Fa= 2991 : 0.900 ]( 1 , 1 , 2990 ),<|>Tot Used: 6760 , Added: 1 , Zero Std: 0 , Max Cor: 0.900
#> 
 12 <R=0.900,thr=0.800,N= 2673>, Top: 1011( 1 ).........=( 1 )[ 2 : 1011 Fa= 3318 : 0.873 ]( 967 , 1283 , 2991 ),<|>Tot Used: 6802 , Added: 1283 , Zero Std: 0 , Max Cor: 0.997
#> 
 13 <R=0.997,thr=0.900,N=  292>, Top: 142( 1 ).[ 1 : 142 Fa= 3351 : 0.900 ]( 141 , 148 , 3318 ),<|>Tot Used: 6802 , Added: 148 , Zero Std: 0 , Max Cor: 0.995
#> 
 14 <R=0.995,thr=0.900,N=  292>, Top: 36( 1 )[ 1 : 36 Fa= 3368 : 0.900 ]( 36 , 36 , 3351 ),<|>Tot Used: 6802 , Added: 36 , Zero Std: 0 , Max Cor: 0.936
#> 
 15 <R=0.936,thr=0.900,N=  292>, Top: 2( 1 )[ 1 : 2 Fa= 3370 : 0.900 ]( 2 , 2 , 3368 ),<|>Tot Used: 6802 , Added: 2 , Zero Std: 0 , Max Cor: 0.900
#> 
 16 <R=0.900,thr=0.800,N=  750>, Top: 243( 2 )..=( 1 )[ 2 : 243 Fa= 3419 : 0.820 ]( 232 , 341 , 3370 ),<|>Tot Used: 6808 , Added: 341 , Zero Std: 0 , Max Cor: 0.984
#> 
 17 <R=0.984,thr=0.900,N=   62>, Top: 31( 1 )[ 1 : 31 Fa= 3429 : 0.900 ]( 31 , 31 , 3419 ),<|>Tot Used: 6808 , Added: 31 , Zero Std: 0 , Max Cor: 0.972
#> 
 18 <R=0.972,thr=0.900,N=   62>, Top: 5( 1 )[ 1 : 5 Fa= 3432 : 0.900 ]( 5 , 5 , 3429 ),<|>Tot Used: 6808 , Added: 5 , Zero Std: 0 , Max Cor: 0.955
#> 
 19 <R=0.955,thr=0.900,N=   62>, Top: 1( 1 )[ 1 : 1 Fa= 3433 : 0.900 ]( 1 , 1 , 3432 ),<|>Tot Used: 6808 , Added: 1 , Zero Std: 0 , Max Cor: 0.900
#> 
 20 <R=0.900,thr=0.800,N=  256>, Top: 79( 1 )[ 1 : 79 Fa= 3444 : 0.800 ]( 74 , 119 , 3433 ),<|>Tot Used: 6813 , Added: 119 , Zero Std: 0 , Max Cor: 0.986
#> 
 21 <R=0.986,thr=0.900,N=   22>, Top: 11( 1 )[ 1 : 11 Fa= 3446 : 0.900 ]( 11 , 11 , 3444 ),<|>Tot Used: 6813 , Added: 11 , Zero Std: 0 , Max Cor: 0.921
#> 
 22 <R=0.921,thr=0.900,N=   22>, Top: 1( 1 )[ 1 : 1 Fa= 3446 : 0.900 ]( 1 , 1 , 3446 ),<|>Tot Used: 6813 , Added: 1 , Zero Std: 0 , Max Cor: 0.892
#> 
 23 <R=0.892,thr=0.800,N=   73>, Top: 27( 1 )[ 1 : 27 Fa= 3450 : 0.800 ]( 26 , 39 , 3446 ),<|>Tot Used: 6815 , Added: 39 , Zero Std: 0 , Max Cor: 0.983
#> 
 24 <R=0.983,thr=0.900,N=    8>, Top: 4( 1 )[ 1 : 4 Fa= 3451 : 0.900 ]( 4 , 4 , 3450 ),<|>Tot Used: 6815 , Added: 4 , Zero Std: 0 , Max Cor: 0.905
#> 
 25 <R=0.905,thr=0.900,N=    8>, Top: 1( 1 )[ 1 : 1 Fa= 3452 : 0.900 ]( 1 , 1 , 3451 ),<|>Tot Used: 6815 , Added: 1 , Zero Std: 0 , Max Cor: 0.892
#> 
 26 <R=0.892,thr=0.800,N=    6>, Top: 3( 1 )[ 1 : 3 Fa= 3453 : 0.800 ]( 3 , 3 , 3452 ),<|>Tot Used: 6815 , Added: 3 , Zero Std: 0 , Max Cor: 0.959
#> 
 27 <R=0.959,thr=0.900,N=    2>, Top: 1( 1 )[ 1 : 1 Fa= 3453 : 0.900 ]( 1 , 1 , 3453 ),<|>Tot Used: 6815 , Added: 1 , Zero Std: 0 , Max Cor: 0.925
#> 
 28 <R=0.925,thr=0.900,N=    2>, Top: 1( 1 )[ 1 : 1 Fa= 3453 : 0.900 ]( 1 , 1 , 3453 ),<|>Tot Used: 6815 , Added: 1 , Zero Std: 0 , Max Cor: 0.867
#> 
 29 <R=0.867,thr=0.800,N=    2>, Top: 1( 1 )[ 1 : 1 Fa= 3453 : 0.800 ]( 1 , 1 , 3453 ),<|>Tot Used: 6815 , Added: 1 , Zero Std: 0 , Max Cor: 0.800
#> 
 30 <R=0.800,thr=0.800,N=    2>
#> 
 [ 30 ], 0.7999954 Decor Dimension: 6815 Nused: 6815 . Cor to Base: 3366 , ABase: 39 , Outcome Base: 0 
#> 
varlistc <- colnames(DEdataframe)[colnames(DEdataframe) != outcome]

pander::pander(sum(apply(dataframe[,varlist],2,var)))

63594442

pander::pander(sum(apply(DEdataframe[,varlistc],2,var)))

6608460

pander::pander(entropy(discretize(unlist(dataframe[,varlist]), 256)))

3.08

pander::pander(entropy(discretize(unlist(DEdataframe[,varlistc]), 256)))

1.63

1.5.1 The decorrelation matrix


if (!largeSet)
{

  par(cex=0.6,cex.main=0.85,cex.axis=0.7)
  
  UPLTM <- attr(DEdataframe,"UPLTM")
  
  gplots::heatmap.2(1.0*(abs(UPLTM)>0),
                    trace = "none",
                    mar = c(5,5),
                    col=rev(heat.colors(5)),
                    main = "Decorrelation matrix",
                    cexRow = cexheat,
                    cexCol = cexheat,
                   srtCol=45,
                   srtRow=45,
                    key.title=NA,
                    key.xlab="|Beta|>0",
                    xlab="Output Feature", ylab="Input Feature")
  
  par(op)
}

1.6 The heatmap of the decorrelated data

if (!largeSet)
{

  hm <- heatMaps(data=DEdataframe[1:numsub,],
                 Outcome=outcome,
                 Scale=TRUE,
                 hCluster = "row",
                 cexRow = cexheat,
                 cexCol = cexheat,
                 srtCol=45,
                 srtRow=45,
                 xlab="Feature",
                 ylab="Sample")
  par(op)
}

1.7 The correlation matrix after decorrelation

if (!largeSet)
{

  cormat <- cor(DEdataframe[,varlistc],method="pearson")
  cormat[is.na(cormat)] <- 0
  
  gplots::heatmap.2(abs(cormat),
                    trace = "none",
                    mar = c(5,5),
                    col=rev(heat.colors(5)),
                    main = "Correlation after IDeA",
                    cexRow = cexheat,
                    cexCol = cexheat,
                     srtCol=45,
                     srtRow=45,
                    key.title=NA,
                    key.xlab="|Pearson Correlation|",
                    xlab="Feature", ylab="Feature")
  
  par(op)
  diag(cormat) <- 0
  print(max(abs(cormat)))
}

1.8 U-MAP Visualization of features

1.8.1 The UMAP based on LASSO on Raw Data


if (nrow(dataframe) < 1000)
{
  classes <- unique(dataframe[1:numsub,outcome])
  raincolors <- rainbow(length(classes))
  names(raincolors) <- classes
  datasetframe.umap = umap(scale(dataframe[1:numsub,varlist]),n_components=2)
  plot(datasetframe.umap$layout,xlab="U1",ylab="U2",main="UMAP: Original",t='n')
  text(datasetframe.umap$layout,labels=dataframe[1:numsub,outcome],col=raincolors[dataframe[1:numsub,outcome]+1])
}

1.8.2 The decorralted UMAP

if (nrow(dataframe) < 1000)
{

  datasetframe.umap = umap(scale(DEdataframe[1:numsub,varlistc]),n_components=2)
  plot(datasetframe.umap$layout,xlab="U1",ylab="U2",main="UMAP: After IDeA",t='n')
  text(datasetframe.umap$layout,labels=DEdataframe[1:numsub,outcome],col=raincolors[DEdataframe[1:numsub,outcome]+1])
}

1.9 Univariate Analysis

1.9.1 Univariate



univarRAW <- uniRankVar(varlist,
               paste(outcome,"~1"),
               outcome,
               dataframe,
               rankingTest="AUC")

100 : V100 200 : V201 300 : V302 400 : V402 500 : V504
600 : V606 700 : V707 800 : V807 900 : V907 1000 : V1008
1100 : V1108 1200 : V1209 1300 : V1309 1400 : V1409 1500 : V1509
1600 : V1610 1700 : V1710 1800 : V1810 1900 : V1911 2000 : V2012
2100 : V2113 2200 : V2213 2300 : V2313 2400 : V2417 2500 : V2518
2600 : V2620 2700 : V2722 2800 : V2822 2900 : V2922 3000 : V3023
3100 : V3123 3200 : V3223 3300 : V3326 3400 : V3428 3500 : V3528
3600 : V3629 3700 : V3734 3800 : V3835 3900 : V3935 4000 : V4038
4100 : V4140 4200 : V4243 4300 : V4344 4400 : V4445 4500 : V4547
4600 : V4649 4700 : V4751 4800 : V4853 4900 : V4954 5000 : V5055
5100 : V5156 5200 : V5256 5300 : V5360 5400 : V5462 5500 : V5564
5600 : V5666 5700 : V5768 5800 : V5868 5900 : V5970 6000 : V6070
6100 : V6171 6200 : V6271 6300 : V6372 6400 : V6473 6500 : V6573
6600 : V6675 6700 : V6777 6800 : V6881 6900 : V6983 7000 : V7088
7100 : V7190 7200 : V7291 7300 : V7393 7400 : V7496 7500 : V7597
7600 : V7701 7700 : V7803 7800 : V7904 7900 : V8007 8000 : V8108
8100 : V8209 8200 : V8310 8300 : V8414 8400 : V8516 8500 : V8620
8600 : V8721 8700 : V8822 8800 : V8925 8900 : V9026 9000 : V9128
9100 : V9232 9200 : V9332 9300 : V9433 9400 : V9533 9500 : V9638
9600 : V9739 9700 : V9841 9800 : V9944




univarDe <- uniRankVar(varlistc,
               paste(outcome,"~1"),
               outcome,
               DEdataframe,
               rankingTest="AUC",
               )

100 : V100 200 : La_V201 300 : V302 400 : La_V402 500 : V504
600 : V606 700 : V707 800 : V807 900 : La_V907 1000 : La_V1008
1100 : La_V1108 1200 : V1209 1300 : La_V1309 1400 : La_V1409 1500 : La_V1509
1600 : La_V1610 1700 : La_V1710 1800 : La_V1810 1900 : La_V1911 2000 : La_V2012
2100 : La_V2113 2200 : La_V2213 2300 : V2313 2400 : La_V2417 2500 : La_V2518
2600 : V2620 2700 : V2722 2800 : V2822 2900 : V2922 3000 : La_V3023
3100 : La_V3123 3200 : La_V3223 3300 : V3326 3400 : V3428 3500 : La_V3528
3600 : La_V3629 3700 : La_V3734 3800 : La_V3835 3900 : La_V3935 4000 : La_V4038
4100 : La_V4140 4200 : La_V4243 4300 : V4344 4400 : V4445 4500 : La_V4547
4600 : La_V4649 4700 : La_V4751 4800 : V4853 4900 : La_V4954 5000 : La_V5055
5100 : V5156 5200 : La_V5256 5300 : V5360 5400 : La_V5462 5500 : La_V5564
5600 : La_V5666 5700 : La_V5768 5800 : La_V5868 5900 : La_V5970 6000 : La_V6070
6100 : La_V6171 6200 : La_V6271 6300 : La_V6372 6400 : La_V6473 6500 : V6573
6600 : La_V6675 6700 : La_V6777 6800 : V6881 6900 : La_V6983 7000 : La_V7088
7100 : La_V7190 7200 : V7291 7300 : La_V7393 7400 : La_V7496 7500 : La_V7597
7600 : La_V7701 7700 : La_V7803 7800 : La_V7904 7900 : V8007 8000 : La_V8108
8100 : La_V8209 8200 : La_V8310 8300 : La_V8414 8400 : La_V8516 8500 : V8620
8600 : La_V8721 8700 : La_V8822 8800 : La_V8925 8900 : La_V9026 9000 : V9128
9100 : La_V9232 9200 : La_V9332 9300 : La_V9433 9400 : V9533 9500 : La_V9638
9600 : La_V9739 9700 : La_V9841 9800 : V9944

1.9.2 Final Table


univariate_columns <- c("caseMean","caseStd","controlMean","controlStd","controlKSP","ROCAUC")

##top variables
topvar <- c(1:length(varlist)) <= TopVariables
tableRaw <- univarRAW$orderframe[topvar,univariate_columns]
pander::pander(tableRaw)
  caseMean caseStd controlMean controlStd controlKSP ROCAUC
V5005 314.7 72.9 239 83.6 0.18466 0.772
V4960 47.5 49.3 124 97.3 0.19534 0.751
V2309 43.1 45.3 113 89.0 0.18665 0.751
V8368 44.9 46.1 116 90.6 0.21091 0.751
V312 47.2 48.0 122 94.7 0.21678 0.750
V3365 46.3 46.9 119 92.5 0.22139 0.749
V9617 40.9 44.7 109 87.5 0.15591 0.749
V414 47.5 50.4 125 100.4 0.16265 0.749
V9092 33.9 63.1 124 132.6 0.00199 0.748
V1936 316.0 79.5 243 79.2 0.28495 0.748


topLAvar <- univarDe$orderframe$Name[str_detect(univarDe$orderframe$Name,"La_")]
topLAvar <- unique(c(univarDe$orderframe$Name[topvar],topLAvar[1:as.integer(TopVariables/2)]))
finalTable <- univarDe$orderframe[topLAvar,univariate_columns]


pander::pander(finalTable)
  caseMean caseStd controlMean controlStd controlKSP ROCAUC
La_V5357 11.737 11.44 -0.913 9.88 1.82e-03 0.792
La_V6838 11.780 6.29 2.208 11.57 3.11e-07 0.778
La_V2185 19.065 31.48 -6.343 25.57 1.54e-04 0.777
La_V571 -11.330 3.74 -7.865 3.90 1.51e-01 0.772
La_V2747 37.862 27.41 13.202 23.24 6.29e-03 0.766
La_V5931 -22.157 40.35 22.116 46.02 1.06e-04 0.764
La_V316 0.699 1.40 -0.609 1.64 5.75e-03 0.764
La_V298 -12.286 29.55 12.450 27.56 2.34e-04 0.748
La_V9281 2.079 3.97 -4.438 10.19 5.92e-09 0.747
V5801 503.182 107.04 385.036 144.30 1.40e-01 0.744

dc <- getLatentCoefficients(DEdataframe)
fscores <- attr(DEdataframe,"fscore")


pander::pander(c(mean=mean(sapply(dc,length)),total=length(dc),fraction=length(dc)/(ncol(dataframe)-1)))
mean total fraction
3.5 6562 0.666

theCharformulas <- attr(dc,"LatentCharFormulas")


finalTable <- rbind(finalTable,tableRaw[topvar[!(topvar %in% topLAvar)],univariate_columns])


orgnamez <- rownames(finalTable)
orgnamez <- str_remove_all(orgnamez,"La_")
finalTable$RAWAUC <- univarRAW$orderframe[orgnamez,"ROCAUC"]
finalTable$DecorFormula <- theCharformulas[rownames(finalTable)]
finalTable$fscores <- fscores[rownames(finalTable)]

Final_Columns <- c("DecorFormula","caseMean","caseStd","controlMean","controlStd","controlKSP","ROCAUC","RAWAUC","fscores")

finalTable <- finalTable[order(-finalTable$ROCAUC),]
pander::pander(finalTable[,Final_Columns])
  DecorFormula caseMean caseStd controlMean controlStd controlKSP ROCAUC RAWAUC fscores
La_V5357 - (1.068)V3417 + V5357 - (0.034)V9171 11.737 11.44 -0.913 9.88 1.82e-03 0.792 0.603 -1
La_V6838 + (0.013)V1474 - (1.016)V5947 + V6838 11.780 6.29 2.208 11.57 3.11e-07 0.778 0.670 0
La_V2185 - (0.723)V1380 + V2185 19.065 31.48 -6.343 25.57 1.54e-04 0.777 0.592 0
La_V571 - (0.829)V405 + V571 - (0.203)V7374 -11.330 3.74 -7.865 3.90 1.51e-01 0.772 0.598 -1
V5005 NA 314.739 72.85 239.304 83.62 1.85e-01 0.772 0.772 NA
La_V2747 + V2747 - (0.906)V3001 37.862 27.41 13.202 23.24 6.29e-03 0.766 0.624 2
La_V5931 - (0.662)V4487 + V5931 -22.157 40.35 22.116 46.02 1.06e-04 0.764 0.517 13
La_V316 + V316 - (0.355)V1380 + (1.902)V6976 - (2.551)V8743 0.699 1.40 -0.609 1.64 5.75e-03 0.764 0.545 -3
V4960 NA 47.511 49.25 123.696 97.33 1.95e-01 0.751 0.751 NA
V2309 NA 43.057 45.32 112.616 89.00 1.87e-01 0.751 0.751 NA
V8368 NA 44.886 46.06 116.036 90.62 2.11e-01 0.751 0.751 NA
V312 NA 47.182 48.01 121.634 94.73 2.17e-01 0.750 0.750 NA
V3365 NA 46.261 46.93 119.054 92.52 2.21e-01 0.749 0.749 NA
V9617 NA 40.886 44.75 108.848 87.49 1.56e-01 0.749 0.749 NA
V414 NA 47.466 50.45 125.348 100.36 1.63e-01 0.749 0.749 NA
V9092 NA 33.886 63.11 123.607 132.62 1.99e-03 0.748 0.748 NA
La_V298 + V298 - (0.713)V4487 -12.286 29.55 12.450 27.56 2.34e-04 0.748 0.549 1
V1936 NA 315.955 79.48 243.062 79.21 2.85e-01 0.748 0.748 NA
La_V9281 - (0.026)V403 - (1.043)V1070 + V9281 2.079 3.97 -4.438 10.19 5.92e-09 0.747 0.608 1
V5801 NA 503.182 107.04 385.036 144.30 1.40e-01 0.744 0.744 10

1.10 Comparing IDeA vs PCA vs EFA

1.10.1 PCA

featuresnames <- colnames(dataframe)[colnames(dataframe) != outcome]
pc <- prcomp(dataframe[,iscontinous],center = TRUE,scale. = TRUE)   #principal components
predPCA <- predict(pc,dataframe[,iscontinous])
PCAdataframe <- as.data.frame(cbind(predPCA,dataframe[,!iscontinous]))
colnames(PCAdataframe) <- c(colnames(predPCA),colnames(dataframe)[!iscontinous]) 
#plot(PCAdataframe[,colnames(PCAdataframe)!=outcome],col=dataframe[,outcome],cex=0.65,cex.lab=0.5,cex.axis=0.75,cex.sub=0.5,cex.main=0.75)

#pander::pander(pc$rotation)


PCACor <- cor(PCAdataframe[,colnames(PCAdataframe) != outcome])


  gplots::heatmap.2(abs(PCACor),
                    trace = "none",
  #                  scale = "row",
                    mar = c(5,5),
                    col=rev(heat.colors(5)),
                    main = "PCA Correlation",
                    cexRow = 0.5,
                    cexCol = 0.5,
                     srtCol=45,
                     srtRow= -45,
                    key.title=NA,
                    key.xlab="Pearson Correlation",
                    xlab="Feature", ylab="Feature")

1.10.2 EFA


EFAdataframe <- dataframeScaled

if (length(iscontinous) < 2000)
{
  topred <- min(length(iscontinous),nrow(dataframeScaled),ncol(predPCA)/2)
  if (topred < 2) topred <- 2
  
  uls <- fa(dataframeScaled[,iscontinous],nfactors=topred,rotate="varimax",warnings=FALSE)  # EFA analysis
  predEFA <- predict(uls,dataframeScaled[,iscontinous])
  EFAdataframe <- as.data.frame(cbind(predEFA,dataframeScaled[,!iscontinous]))
  colnames(EFAdataframe) <- c(colnames(predEFA),colnames(dataframeScaled)[!iscontinous]) 


  
  EFACor <- cor(EFAdataframe[,colnames(EFAdataframe) != outcome])
  
  
    gplots::heatmap.2(abs(EFACor),
                      trace = "none",
    #                  scale = "row",
                      mar = c(5,5),
                      col=rev(heat.colors(5)),
                      main = "EFA Correlation",
                      cexRow = 0.5,
                      cexCol = 0.5,
                       srtCol=45,
                       srtRow= -45,
                      key.title=NA,
                      key.xlab="Pearson Correlation",
                      xlab="Feature", ylab="Feature")
}

1.11 Effect on CAR modeling

par(op)
par(xpd = TRUE)
dataframe[,outcome] <- factor(dataframe[,outcome])
rawmodel <- rpart(paste(outcome,"~."),dataframe,control=rpart.control(maxdepth=3))
pr <- predict(rawmodel,dataframe,type = "class")

  ptab <- list(er="Error",detail=matrix(nrow=6,ncol=1))
  if (length(unique(pr))>1)
  {
    plot(rawmodel,main="Raw",branch=0.5,uniform = TRUE,compress = TRUE,margin=0.1)
    text(rawmodel, use.n = TRUE,cex=0.75)
    ptab <- epiR::epi.tests(table(pr==0,dataframe[,outcome]==0))
  }


pander::pander(table(dataframe[,outcome],pr))
  0 1
0 109 3
1 13 75
pander::pander(ptab$detail[c(5,3,4,6),])
  statistic est lower upper
5 diag.ac 0.920 0.873 0.954
3 se 0.852 0.761 0.919
4 sp 0.973 0.924 0.994
6 diag.or 209.615 57.738 761.000

par(op)
par(xpd = TRUE)
DEdataframe[,outcome] <- factor(DEdataframe[,outcome])
IDeAmodel <- rpart(paste(outcome,"~."),DEdataframe,control=rpart.control(maxdepth=3))
pr <- predict(IDeAmodel,DEdataframe,type = "class")

  ptab <- list(er="Error",detail=matrix(nrow=6,ncol=1))
  if (length(unique(pr))>1)
  {
    plot(IDeAmodel,main="IDeA",branch=0.5,uniform = TRUE,compress = TRUE,margin=0.1)
    text(IDeAmodel, use.n = TRUE,cex=0.75)
    ptab <- epiR::epi.tests(table(pr==0,DEdataframe[,outcome]==0))
  }

pander::pander(table(DEdataframe[,outcome],pr))
  0 1
0 107 5
1 10 78
pander::pander(ptab$detail[c(5,3,4,6),])
  statistic est lower upper
5 diag.ac 0.925 0.879 0.957
3 se 0.886 0.801 0.944
4 sp 0.955 0.899 0.985
6 diag.or 166.920 54.874 507.747

par(op)
par(xpd = TRUE)
PCAdataframe[,outcome] <- factor(PCAdataframe[,outcome])
PCAmodel <- rpart(paste(outcome,"~."),PCAdataframe,control=rpart.control(maxdepth=3))
pr <- predict(PCAmodel,PCAdataframe,type = "class")
ptab <- list(er="Error",detail=matrix(nrow=6,ncol=1))
if (length(unique(pr))>1)
{
  plot(PCAmodel,main="PCA",branch=0.5,uniform = TRUE,compress = TRUE,margin=0.1)
  text(PCAmodel, use.n = TRUE,cex=0.75)
  ptab <- epiR::epi.tests(table(pr==0,PCAdataframe[,outcome]==0))
}

pander::pander(table(PCAdataframe[,outcome],pr))
  0 1
0 104 8
1 39 49
pander::pander(ptab$detail[c(5,3,4,6),])
  statistic est lower upper
5 diag.ac 0.765 0.700 0.822
3 se 0.557 0.447 0.663
4 sp 0.929 0.864 0.969
6 diag.or 16.333 7.100 37.573


par(op)

1.11.1 EFA


  EFAdataframe[,outcome] <- factor(EFAdataframe[,outcome])
  EFAmodel <- rpart(paste(outcome,"~."),EFAdataframe,control=rpart.control(maxdepth=3))
  pr <- predict(EFAmodel,EFAdataframe,type = "class")
  
  ptab <- list(er="Error",detail=matrix(nrow=6,ncol=1))
  if (length(unique(pr))>1)
  {
    plot(EFAmodel,main="EFA",branch=0.5,uniform = TRUE,compress = TRUE,margin=0.1)
    text(EFAmodel, use.n = TRUE,cex=0.75)
    ptab <- epiR::epi.tests(table(pr==0,EFAdataframe[,outcome]==0))
  }


  pander::pander(table(EFAdataframe[,outcome],pr))
  0 1
0 109 3
1 13 75
  pander::pander(ptab$detail[c(5,3,4,6),])
  statistic est lower upper
5 diag.ac 0.920 0.873 0.954
3 se 0.852 0.761 0.919
4 sp 0.973 0.924 0.994
6 diag.or 209.615 57.738 761.000
  par(op)